#!/usr/bin/env python
# coding: utf-8

# This script accompanies the manuscript "Leveraging Researcher Domain Expertise to Annotate Concepts within Imbalanced Data".
# 
# In that paper, we describe a method for combining researcher domain expertise with an unsupervised exploration of the latent semantic space, to annotate theory-driven categories for a supervised classifier.
# 
# Here, we lay out the procedure for the simulations we ran to compare empirically our proposed method and two leading annotation approaches - random sampling and active learning.
# 
# We ran these simulations on two datasets:
# 
# 1) 20 NewsGroups
# 
# 2) New York Times Front Page Dataset (Boydstun, 2013) with Comparative Agendas Project (CAP) categories (Dowding et al., 2015)
# 
# In this script, we run on the New York Times Front Page dataset, but the procedure is the same for any fully-classified corpus. We have also prepared and uploaded document-level embeddings for the 20 NewsGroups dataset for replication of two of the simulations cited within our paper, yet such simulations run on slightly different category distribution schemes (and are explained in a script accompanying this one).
# 
# May 2022

# # Create embeddings for the full corpus

# We start with our dataset, wanting to convert each document into embeddings on the shared latent semantic space. While a number of document-embedding techniques exist, here we use SentenceTransformers (https://huggingface.co/sentence-transformers/stsb-mpnet-base-v2) - pre-trained language models optimized for semantic similarity at the sentence level (Reimers and Gurevych, 2019). However, any other sentence or document embedding model could be used for this preparation stage.
# 
# The SentenceTransformers are trained for the task of converting sentences (text segments) into embeddings. The New York Times Front page dataset includes full documents (having extracted the full articles to the existing dataset via LexisNexis). Thus, in that case we split the full articles into sentence segments, before converting each such segment into an embedding. Then, we found the mean vector for the document-level embedding.
# 
# We have uploaded the document-embeddings for both datasets, but we note that there exist many different embedding models that users can try to utilize for improved results (or, models more relevant for their specific research cases). 

# In[ ]:


import pandas as pd
import os
import numpy as np
import pickle


# In[ ]:


### DIRECTORY WITH THE DATASET:
## change to the directory location on user computer

DIR = r"E:\Simulation_Replication Materials"


# In[ ]:


## Load the embeddings for the corpus
embds = os.path.join(DIR, "NYT_FrontPage_forSimulation_Embeddings.pickle")

pickle_in = open(embds,"rb")
embeddings = pickle.load(pickle_in)


# In[ ]:


## Load the CSV with the metadata for each embedding - text, category, etc.

df = pd.read_csv(os.path.join(DIR, 'NYT_FrontPage_forSimulation_Metadata.csv'))


# # Running the Simulations

# At this stage we have our dataset fully converted to document embeddings, as well as our original dataframe containing the metadata necessary for this simulation - each document's true labels.
# 
# For the simulation itself, we will choose randomly a single domain (i.e., a theoretical area of interest) to focus on in each run. We will then aggregate the rest of the domains to a single category 'Other'. Then, in the simulation itself, the classifiers will be tasked with classifying the multiple sub-categories within the domain.

# In[ ]:


### Add the document embeddings to the dataframe

df['X'] = [np.array(x) for x in embeddings]


# In[ ]:


### Create lists of major categories, sub-categories:

CATEGORIES = list(set(df['Category']))
DOMAINS = list(set(df['Domain']))


# In[ ]:


## Keep only those Domains with at least 1000 articles to allow the simulation to run fully
TOP_DOMAINS = ["International Affairs and Foreign Aid", "Defense", 
               "Government Operations", "Law, Crime, and Family Issues", 
               "Health", "Banking, Finance, and Domestic Commerce"]


# In[ ]:


## Create the randomizer functions to randomly choose the major topic domains in each simulation round:

import random

## Choose the random seeding to allow for replication of the procedure
np.random.seed(17)

def choose_random_domain(domains=DOMAINS):
    ## Return a single randomly chosen major topic domain
    return(random.sample(domains, 1))

def filter_small_categories(df, domain_to_keep):
    ## In some cases, some subtopics were too small even for the preliminary sampling stage
    ## Thus, we include a filter to remove these categories from our simulation:

    temp = df[df['Domain'] == domain_to_keep]
    
    ## Create list of the sub-topics:
    categories_ = list(set(temp['Category']))
    
    ## Create a new, empty dataframe to begin to fill with the categories we will target
    new_df = pd.DataFrame()
    
    ## Run through the list of sub-topics to make sure they contain at least 100 documents
    for cat in categories_:
        subset = temp[temp['Category'] == cat]
        if len(subset) >= 100:
            new_df = pd.concat([new_df, subset])
    
    ## Now, join the filtered domain of interest back to the rest of the dataframe
    other2 = df[df['Domain'] != domain_to_keep]
    new_df = pd.concat([new_df, other2])
            
    return new_df

def change_labels(domain_to_keep, df_):
    ## Keep the domain of interest, while aggregating the other majortopics to a single 'Other' category
    
    new_df = df_.copy()
    
    ## Change all the other domains to other
    other_df = new_df[new_df['Domain'] != domain_to_keep]
    other_df['Category'] = 'Other'
    
    ## Keep only the categories from the target domain
    subset = new_df[new_df['Domain'] == domain_to_keep]
    
    ## Join together for the new dataframe ready for the simulation
    changed_df = pd.concat([other_df, subset])

    return changed_df


# Here we choose the parameters for the simulation:

# In[ ]:


## How many times to run the simulation:
RUNS = 300

## How many samples to extract at each sampling iteration:
STEPSIZE = 18

## Number of iterations for each simulation run 
## (The more iterations, the more sampling rounds executed)
NUM_STEPS = 100


# In[ ]:


## Imports for simulation functions:

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.stats import entropy
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

## TQDM Notebook - not required, but allows the user to track simulation progress:
from tqdm import tqdm_notebook


# In[ ]:


## Additional functions needed for the simulation:

## Trains an SVM classification model on X and y and returns the classifier
def train_svm(X, y, proba=False): # add hyperparameter tuning
    clf = SVC(probability=proba)
    ## Fit the model on the training set:
    clf.fit(X, y)
    return clf

## Takes as an input a trained SVM model, runs on the test set, and returns the accuracy results
def create_classification_report(clf, testX, testY, output_dict=True):
    ## Use the model to make predictions on the test set:
    predY = clf.predict(testX)
    results = classification_report(le.inverse_transform([int(x) for x in testY]), le.inverse_transform([int(x) for x in predY]),
                                    output_dict=output_dict)
    return results

## For our method, after the initial sampling and each ensuing round of stratified sampling, 
## need to calculate the centroids for each concept of interest

## At each stage, run on the dataframe of the expanding training set
def calculate_centroids(df, categories):
    centroids_ = []
    
    for c in categories:
        ## Run over each subcategory:
        temp = df[df['y'] == c]
        
        vecs = []
        ## Find the mean for the document embbedings - the centroid
        for i,r in temp.iterrows():
            vecs.append(r.X)
        meanvec = np.mean(vecs, axis=0)
        centroids_.append(meanvec)
    
    ## Return the centroids
    return centroids_


# Now, we run the actual simulation. Each run begins with the compilation of an initial training set, based on each strategy we want to compare: Completely random sampling for "random sampling" and "active learning", and a sampling of each target category for our method - corresponding to the role of human experts in the creation of the core training set.
# 
# In the next runs, we continue to expand the three training sets in parallel. For random sampling we simply add additional samples, for active learning we add those instances the model is most uncertain on, and for our method we utilize stratified sampling.
# 
# Between each iteration, we train a SVM model on the updated training sets for each parallel method, and record the accuracy scores for comparison later.

# In[ ]:


### Function to actually execute the simulation for each run

def run_annotations(pooldf, filtered_categories, test, STEPSIZE=STEPSIZE, NUM_STEPS=NUM_STEPS):
    ## First, define a number of lists and empty dataframes which we will continue to fill through each iteration
    ## These will be returned in the output and will later be used to record the efficiency and accuracy
    ## of each method:    
    
    ## List to record the number of samples in the training set at each iteration
    samples = [] # number of samples

    ## Lists to hold the f1 accuracy scores for each method over each iteration
    f1s_random = [] # random sampling
    f1s_active = [] # active learning
    ## Note - we will run two versions of our stratified sampling method in this simulation to offer
    ## a more comprehensive exploration of our method. The only thing we change here are the sizes and distances
    ## of the hierarchal strata. This is due to the varying densities and distances of the latent semantic spaces for
    ## different corpora/datasets. However, the guiding principle remains the same.
    f1s_ours_original = [] # Equally-spaced strata covering the full 0-1 cosine distance from each centroid
    f1s_ours_closer = [] # Smaller strata focusing on the areas closer to the centroids, not covering the full distance

    ## The pool of unlabelled documents left for each method
    ## At the start, the whole corpus is still unlabelled:
    pooldf_random = pooldf.copy()
    pooldf_active = pooldf.copy()
    pooldf_ours_original = pooldf.copy()
    pooldf_ours_closer = pooldf.copy()

    ## The collection of annotated samples - i.e., the expanding training set
    ## We start with the initial sampling based on each approach:
    ## For random sampling and guided search, take a random sample from the full dataset:
    annotated_random = pooldf_random.sample(20)
    ## However, due to the imbalance in the data, need to ensure at least two categories present 
    ## in the training set for the SVM model to work:
    if len(set(annotated_random['y'])) == 1:
        annotated_random.drop(annotated_random.tail(1).index,inplace=True)
        annotated_random = annotated_random.append(pooldf_ours_original.groupby("y").sample(1))
    annotated_active = annotated_random.copy()
    ## For our approach, emulating the human element we sample from each category:
    annotated_ours_original = pooldf_ours_original.groupby("y").sample(20) #  - start with samples from all
    annotated_ours_closer = annotated_ours_original.copy()

    ## Remove the annotated texts from the unlabelled pool
    pooldf_random = pooldf_random.drop(annotated_random.index)
    pooldf_active = pooldf_active.drop(annotated_active.index)
    pooldf_ours_closer = pooldf_ours_closer.drop(annotated_ours_closer.index)
    pooldf_ours_original = pooldf_ours_original.drop(annotated_ours_original.index)
    
    ###############################################################################################
    ################################ BEGIN SAMPLING ITERATIONS ####################################
    ###############################################################################################
    
    ## Main loop - based on the number of iterations defined earlier:
    for i in range(NUM_STEPS):
        
        ### add in the first step as taking the preliminary sample collection
        
        if i == 0:
            annotated_ours_original = annotated_ours_original
            annotated_ours_closer = annotated_ours_closer
            annotated_random = annotated_random
            annotated_active = annotated_active
        
        elif i > 0:
        
            ## Now, run through each method in parallel:

            ############################ OUR METHOD - ORIGINAL #################################
            ### Our method - sample from each strata for a category chosen at random

            c = calculate_centroids(annotated_ours_original, filtered_categories) # Find the centroids for each category
            similarities = cosine_similarity(np.array(pooldf_ours_original["X"].to_list()), c) # Find distances of all documents from each centroid

            for ii, cc in enumerate(filtered_categories): # Add column with distance from each centroid for each document in pool
                pooldf_ours_original[cc] = [x[ii] for x in similarities]

            ## The hierarchal strata:
            strata_o = [(0.8, 0.9), (0.7, 0.8), (0.6, 0.7), (0.5, 0.6), (0.4, 0.5), 
                      (0.3 ,0.4)] 

            ## Randomly choose category
            cat_choice = random.choice(filtered_categories)
            ## Calculate empirical strata for category over full distance
            emp_strata = [np.quantile(pooldf_ours_original[str(cat_choice)], [x, y]) for x, y in strata_o]

            new_ours_orig = []

            ## The stratified sampling:
            for strata in strata_o:
                ## Select only subset corresponding to specific strata
                temp = pooldf_ours_original[(pooldf_ours_original[str(cat_choice)] > strata[0]) & (pooldf_ours_original[str(cat_choice)] <= strata[1])]

                if len(temp) < (STEPSIZE / (len(strata_o))): # Ensure no errors due to sampling larger than the strata
                    newsamp = temp
                    new_ours_orig.append(newsamp)
                else:
                    newsamp = temp.sample(int(STEPSIZE / (len(strata_o))))
                    new_ours_orig.append(newsamp)

                ## Remove samples from remaining pool
                pooldf_ours_original = pooldf_ours_original.drop(newsamp.index)

            ## The stratified samples:
            new_ours_original = pd.concat(new_ours_orig)

            ############################ OUR METHOD - MODIFIED STRATA #################################
            ### Our method, but strata closer to the centroids

            c = calculate_centroids(annotated_ours_closer, filtered_categories)
            similarities = cosine_similarity(np.array(pooldf_ours_closer["X"].to_list()), c)

            for ii, cc in enumerate(filtered_categories):
                pooldf_ours_closer[cc] = [x[ii] for x in similarities]

            ## Closer strata for larger, more spread out corpus
            strata_o = [(0.95, 1), (0.9, 0.95), (0.85, 0.9), (0.8, 0.85), (0.7, 0.8), 
                      (0.6 ,0.7)]   

            cat_choice = random.choice(filtered_categories)
            emp_strata = [np.quantile(pooldf_ours_closer[str(cat_choice)], [x, y]) for x, y in strata_o]

            new_ours_closer = []

            for strata in strata_o:
                temp = pooldf_ours_closer[(pooldf_ours_closer[str(cat_choice)] > strata[0]) & (pooldf_ours_closer[str(cat_choice)] <= strata[1])]

                if len(temp) < (STEPSIZE / (len(strata_o))):
                    newsamp = temp
                    new_ours_closer.append(newsamp)
                else:
                    newsamp = temp.sample(int(STEPSIZE / (len(strata_o))))
                    new_ours_closer.append(newsamp)

                pooldf_ours_closer = pooldf_ours_closer.drop(newsamp.index)

            new_ours_closer = pd.concat(new_ours_closer)


            ############################ RANDOM SAMPLING #############################################
            new_random = pooldf_random.sample(STEPSIZE) 
            pooldf_random = pooldf_random.drop(new_random.index)


            ############################ ACTIVE LEARNING #############################################
            ## Train svm on the training set
            clf = train_svm(annotated_active["X"].to_list(), annotated_active["y"].to_list(), proba=True)
            ## Add column with entropy of each prediction
            pooldf_active["metric"] = entropy(clf.predict_proba(pooldf_active["X"].to_list()).transpose())
            ## Sort by descending uncertainty
            pooldf_active = pooldf_active.sort_values(by="metric", ascending=False)
            ## Sample:
            new_active = pooldf_active.iloc[0:STEPSIZE]
            pooldf_active = pooldf_active.drop(new_active.index)


        ###############################################################################################
        ###################################### END SAMPLING STAGE #####################################
        ###############################################################################################

            ## Add newly sampled texts to each of the training sets
            annotated_random = pd.concat([annotated_random, new_random])
            annotated_active = pd.concat([annotated_active, new_active])
            annotated_ours_original = pd.concat([annotated_ours_original, new_ours_original])
            annotated_ours_closer = pd.concat([annotated_ours_closer, new_ours_closer])

        ## Train intermediate models
        model_random = train_svm(annotated_random["X"].to_list(), annotated_random["y"].to_list())
        model_active = train_svm(annotated_active["X"].to_list(), annotated_active["y"].to_list())
        model_ours_original = train_svm(annotated_ours_original["X"].to_list(), annotated_ours_original["y"].to_list())
        model_ours_closer = train_svm(annotated_ours_closer["X"].to_list(), annotated_ours_closer["y"].to_list())
 
        ## Evaulate and save results
        f1s_random.append(create_classification_report(model_random, test["X"].to_list(), test["y"]))
        f1s_active.append(create_classification_report(model_active, test["X"].to_list(), test["y"]))
        f1s_ours_closer.append(create_classification_report(model_ours_closer, test["X"].to_list(), test["y"]))
        f1s_ours_original.append(create_classification_report(model_ours_original, test["X"].to_list(), test["y"]))
        
        ## Save number of samples
        samples.append(annotated_random.shape[0])
        
        ## Return accuracy results for each method, number of samples
        to_return = [f1s_random, f1s_active, f1s_ours_original, f1s_ours_closer, samples]

    return to_return


# In[ ]:


## Convert categories to integer labels
from sklearn import preprocessing
le = preprocessing.LabelEncoder()


# In[ ]:


## Function to run the full simulation from start to finish (one run)
def run_simulation(pooldf, RUNS=RUNS, STEPSIZE=STEPSIZE, NUM_STEPS=NUM_STEPS):
    
    ## First, compile/prepare the dataframe for each run
    
    ## Choose a domain at random:
    dom_ = choose_random_domain(TOP_DOMAINS)[0]
    
    ## Then, prepare the dataframe around the chosen major topic/domain
    converted_ = change_labels(dom_, pooldf)
    converted_ = filter_small_categories(converted_, dom_)

    ## Assign numerical values for the categorical variable and store in another column
    converted_['y'] = le.fit_transform(converted_['Category'])
    
    ## Split the dataset into test set and pool set
    X_pool, X_test, y_pool, y_test = train_test_split(converted_['X'], converted_['y'], test_size=0.20, random_state=42)
    y_pool = [str(x) for x in y_pool]
    y_test = [str(x) for x in y_test]
    pooldf = pd.DataFrame({"X": X_pool.tolist(), "y": y_pool})
    test = pd.DataFrame({"X": X_test.tolist(), "y": y_test})
    
    new_categories = list(set(pooldf['y']))
    
    ## Run the simulation:
    outputs = run_annotations(pooldf, new_categories, test=test)
    
    return outputs


# In[ ]:


### Create dictionaries to hold all information for each run
## Each key will be a run and the values will correspond to the f1 scores and samples
## for each parallel method of the simulation

random_dict = {}
active_dict = {}
ours_dict_closer = {}
ours_original_dict = {}
samples_dict = {}


for run in tqdm_notebook(range(RUNS)):
    outputs = run_simulation(df) # Return the outputs from the simulation's run
    random_dict.setdefault(str(run),outputs[0])
    active_dict.setdefault(str(run),outputs[1])
    ours_original_dict.setdefault(str(run),outputs[2])
    ours_dict_closer.setdefault(str(run),outputs[3])
    samples_dict.setdefault(str(run),outputs[4])


# # Visualizing the Results

# One of the advantages of utilizing a computerized simulation is the ability to run multiple experiments/iterations of our empirical comparisons. Now, we can ascertain the average performance for each method being tested. This helps to ensure that the results we find are due to the approach chosen, and not to idiosyncrasies of category choice or other inherent random elements. 
# 
# Here we find the average accuracy levels (vector) for each method and compare via a graph.
# 
# The outputs are returned in dictionary form, and can be saved to file as pickles for revisiting later.

# In[ ]:


## Find the mean vector for each dictionary (each dictionary corresponds to an annotation strategy)
def extract_macros_mean(results_dict):
    vecs = []
    for row in results_dict:
        vec = []
        for i in (results_dict[row]):
            vec.append(i['macro avg']['f1-score'])
        vecs.append(vec)
    mean_ = np.mean([v for v in vecs], axis=0)
    return mean_


# In[ ]:


## Visualize the performances

import matplotlib.pyplot as plt

plt.figure(figsize=(12,8))
## Note, sample size remains consistent throughout the multiple runs
plt.plot(samples_dict['0'], extract_macros_mean(random_dict), label='Random Sampling')
plt.plot(samples_dict['0'], extract_macros_mean(active_dict), label='Active Learning')
plt.plot(samples_dict['0'], extract_macros_mean(ours_original_dict), label='Our Method')
plt.plot(samples_dict['0'], extract_macros_mean(ours_dict_closer), label='Our Method - modified')
plt.xlabel(str('# of training samples'), fontsize=14, fontweight='bold')
plt.ylabel(str('f1 (macro)'), fontsize=14, fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(plt.gca().get_legend().get_texts(), fontsize='14')
plt.tight_layout()

### To save the graph:
# plt.savefig('Comparing_Annotation_Approaches.png')


# In[ ]:




